from pyspark.sql import SparkSessionfrom pyspark.sql import functions as Ffrom pyspark.sql.functions import col, split, explode, regexp_replace, transform, whenfrom pyspark.sql.functions import col, monotonically_increasing_idfrom pyspark.sql.types import StructType # to/from JSONimport jsonimport reimport numpy as npimport pandas as pdimport plotly.express as pximport plotly.io as pioimport plotly.graph_objects as gonp.random.seed(30) # set a fixed seed for reproducibilitypio.renderers.default ="vscode+notebook"## Initialize Spark Sessionspark = SparkSession.builder.appName("JobPostingsAnalysis").getOrCreate()# Load schema from JSON filewithopen("data/schema_lightcast.json") as f: schema = StructType.fromJson(json.load(f))# Load Datadf = (spark.read .option("header", "true") .option("inferSchema", "false") .schema(schema) # saved schema .option("multiLine", "true") .option("escape", "\"") .csv("data/lightcast_job_postings.csv") .limit(5000))df.createOrReplaceTempView("job_postings")# Show Schema and Sample Data#df.printSchema() df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/23 03:11:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/09/23 03:11:24 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 0:> (0 + 1) / 1]
1.1 Salary Distribution by Industry and Employment Type
Compare salary variations across industries.
Filter the dataset - Remove records where salary is missing or zero.
Aggregate Data - Group by NAICS industry codes (e.g., NAICS2_NAME). - Group by employment type (EMPLOYMENT_TYPE_NAME) and compute salary distribution. - Calculate salary percentiles (25th, 50th, 75th) for each group.
Visualize results - Create a box plot where: - X-axis = NAICS2_NAME - Y-axis = SALARY_FROM, or SALARY_TO, or SALARY - Group/color = EMPLOYMENT_TYPE_NAME - Customize colors, fonts, and styles.
Explanation: Write two sentences about what the graph reveals (e.g., median differences across industries and dispersion by employment type).
2 Set up plotly template
3 Salary Distribution by Industry and Employment Type
Compare salary variations across industries.
Filter the dataset
Remove records where Salary is missing or zero.
Aggregate Data
Group by NAICS industry codes.
Group by employment type and compute salary-distribution.
Visualize results
Create a box plot where:
X-axis = NAICS2_NAME.
Y-axis = SALARY_FROM.
Group by EMPLOYMENT_TYPE_NAME.
Customize colors, fonts, and styles.
Explanation: Write two sentences about what the graph reveals.
4 Salary Analysis by ONET Occupation Type (Bubble Chart)
Analyze how salaries differ across ONET occupation types.
Aggregate Data
Compute median salary for each occupation in the ONET taxonomy.
Visualize results
Create a bubble chart where:
X-axis = ONET_NAME
Y-axis = Median Salary
Size = Number of job postings
Apply custom colors and font styles.
Explanation: Write two sentences about what the graph reveals.
5 Salary by Education Level
Create two groups:
Bachelor’s or lower (Bachelor’s, GED, Associate, No Education Listed)
Master’s or PhD (Master’s degree, Ph.D. or professional degree)
Plot scatter plots for each group using MAX_YEARS_EXPERIENCE (with jitter), Average_Salary, LOT_V6_SPECIALIZED_OCCUPATION_NAME.
Then, plot histograms overlaid with KDE curves for each group.
This would generate two scatter plots and two histograms.
After each graph, add a short explanation of key insights.
6 Salary by Remote Work Type
Split into three groups based on REMOTE_TYPE_NAME:
Remote
Hybrid
Onsite (includes [None] and blank)
Plot scatter plots for each group using MAX_YEARS_EXPERIENCE (with jitter), Average_Salary, LOT_V6_SPECIALIZED_OCCUPATION_NAME.
Also, create salary histograms for all three groups.
After each graph, briefly describe any patterns or comparisons.
Submission Instructions
Submit the Word Document (part of git repo) containing:
The HTTPS URL of your GitHub repository.
Answer to the questions.
Visualizations created using matplotlib, Seaborn or plotly (preferred).
Answers to the questions below.
# Step 1: Spark SQL - Median salary and job count per TITLE_NAMEsalary_analysis = spark.sql(""" SELECT LOT_OCCUPATION_NAME AS Occupation_name, PERCENTILE(SALARY, 0.5) AS Median_Salary, COUNT(*) AS Job_Postings FROM job_postings GROUP BY LOT_OCCUPATION_NAME ORDER BY Job_Postings DESC LIMIT 10""")# Step 2: Convert to Pandas DataFramesalary_pd = salary_analysis.toPandas()salary_pd.head()# Step 3: Bubble chart using Plotlyimport plotly.express as pxfig = px.scatter( salary_pd, x="Occupation_name", y="Median_Salary", size="Job_Postings", title="Salary Analysis by LOT Occupation Type (Bubble Chart)", labels={"Occupation_name": "LOT Occupation","Median_Salary": "Median Salary","Job_Postings": "Number of Job Postings" }, hover_name="Occupation_name", size_max=60, width=1000, height=600, color="Job_Postings", color_continuous_scale="Plasma")# Step 4: Layout customizationfig.update_layout( font_family="Arial", font_size=14, title_font_size=25, xaxis_title="LOT Occupation", yaxis_title="Median Salary", plot_bgcolor="white", xaxis=dict( tickangle=-45, showline=True, linecolor="black" ), yaxis=dict( showline=True, linecolor="black" ))# Step 5: Show and exportfig.show()fig.write_image("output/Q7.svg", width=1000, height=600, scale=1)
[Stage 16:> (0 + 1) / 1]
# Defining education level groupingslower_deg = ["Bachelor's", "Associate", "GED", "No Education Listed", "High school"]higher_deg = ["Master's degree", "PhD or professional degree"]# Adding EDU_GROUP columndf = df.withColumn("EDU_GROUP", when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}"for deg in lower_deg])), "Bachelor's or lower") .when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}"for deg in higher_deg])), "Master's or PhD") .otherwise("Other"))# Casting necessary columns to floatdf = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))df = df.withColumn("Average_Salary", col("Average_Salary").cast("float"))# Filtering for non-null and positive valuesdf = df.filter( col("MAX_YEARS_EXPERIENCE").isNotNull() & col("Average_Salary").isNotNull() & (col("MAX_YEARS_EXPERIENCE") >0) & (col("Average_Salary") >0))# Filtering for just the two education groupsdf_filtered = df.filter(col("EDU_GROUP").isin("Bachelor's or lower", "Master's or PhD"))# Converting to Pandas for plottingdf_pd = df_filtered.toPandas()df_pd.head()